{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 所有特征\n",
    "\n",
    "## 问题特征\n",
    "\n",
    "- 问题出现次数: 1\n",
    "- 问题单词数量: 2\n",
    "- 问题字符数量: 2\n",
    "- 问题Hash值: 2\n",
    "\n",
    "## 问题对特征\n",
    "\n",
    "- 问题对重复单词数量: 1\n",
    "- 问题对重复字符数量: 1\n",
    "\n",
    "## 图特征\n",
    "\n",
    "- Clique Size, 与此问题对相互毗邻结点组成的子图中结点的数量: 1\n",
    "- K-core, 每个点最大的K-core值: 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import networkx as nx\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from itertools import combinations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# All input files are addressed relative to one data directory.\n",
    "DATA_PATH = \"./data/\"\n",
    "\n",
    "TRAIN_PATH = f\"{DATA_PATH}train.csv\"\n",
    "TEST_PATH = f\"{DATA_PATH}test.csv\"\n",
    "QUEST_PATH = f\"{DATA_PATH}question.csv\"\n",
    "WORD_EMBED_PATH = f\"{DATA_PATH}word_embed.txt\"\n",
    "CHAR_EMBED_PATH = f\"{DATA_PATH}char_embed.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Pair tables and the question table.\n",
    "train_data = pd.read_csv(TRAIN_PATH)\n",
    "test_data = pd.read_csv(TEST_PATH)\n",
    "question_data = pd.read_csv(QUEST_PATH)\n",
    "\n",
    "# Embedding files: space-separated, no header row, first column used as the index.\n",
    "embed_read_options = dict(sep=\" \", header=None, index_col=0)\n",
    "word_embedding_data = pd.read_csv(WORD_EMBED_PATH, **embed_read_options)\n",
    "char_embedding_data = pd.read_csv(CHAR_EMBED_PATH, **embed_read_options)\n",
    "\n",
    "# Turn the space-joined \"words\" / \"chars\" strings into token lists.\n",
    "for token_column in (\"words\", \"chars\"):\n",
    "    question_data[token_column] = question_data[token_column].str.split(\" \")\n",
    "\n",
    "# Independent copy of the training labels.\n",
    "label = train_data[\"label\"].values.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Every question id occurring in either column of the train and test pair tables.\n",
    "total_question = pd.concat(\n",
    "    [frame[column] for frame in (train_data, test_data) for column in (\"q1\", \"q2\")]\n",
    ")\n",
    "\n",
    "# Occurrence count per question id as a two-column (qid, q_count) table.\n",
    "question_feature = total_question.value_counts().rename_axis(\"qid\").reset_index(name=\"q_count\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Distinct question ids in first-seen order, renumbered from 0.\n",
    "unique_question = total_question.drop_duplicates().reset_index(drop=True)\n",
    "\n",
    "# Lookup table: question id -> its position in unique_question.\n",
    "position_by_question = pd.Series(data=unique_question.index, index=unique_question.values)\n",
    "question_dict = position_by_question.to_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keras.preprocessing.text import Tokenizer\n",
    "\n",
    "# One tokenizer per granularity, each fitted on the pre-split token lists.\n",
    "word_tokenizer, char_tokenizer = Tokenizer(), Tokenizer()\n",
    "word_tokenizer.fit_on_texts(question_data[\"words\"])\n",
    "char_tokenizer.fit_on_texts(question_data[\"chars\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Token frequency tables built from the fitted tokenizers, most frequent first.\n",
    "word_count = pd.DataFrame(\n",
    "    sorted(word_tokenizer.word_counts.items(), key=lambda item: item[1], reverse=True),\n",
    "    columns=[\"word\", \"word_times\"],\n",
    ")\n",
    "char_count = pd.DataFrame(\n",
    "    sorted(char_tokenizer.word_counts.items(), key=lambda item: item[1], reverse=True),\n",
    "    columns=[\"char\", \"char_times\"],  # column was misspelled \"cahr\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Column names given to the token lists of the first / second question of a pair.\n",
    "first_question_names = {\"words\": \"words1\", \"chars\": \"chars1\"}\n",
    "second_question_names = {\"words\": \"words2\", \"chars\": \"chars2\"}\n",
    "\n",
    "# Look up the token lists for q1 and then q2; the label column is dropped from the training frame.\n",
    "train = (\n",
    "    train_data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")\n",
    "    .drop([\"qid\", \"label\"], axis=1)\n",
    "    .rename(columns=first_question_names)\n",
    "    .merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")\n",
    "    .drop([\"qid\"], axis=1)\n",
    "    .rename(columns=second_question_names)\n",
    ")\n",
    "\n",
    "# Same lookups for the test pairs.\n",
    "test = (\n",
    "    test_data.merge(question_data, how=\"left\", left_on=\"q1\", right_on=\"qid\")\n",
    "    .drop([\"qid\"], axis=1)\n",
    "    .rename(columns=first_question_names)\n",
    "    .merge(question_data, how=\"left\", left_on=\"q2\", right_on=\"qid\")\n",
    "    .drop([\"qid\"], axis=1)\n",
    "    .rename(columns=second_question_names)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Look up q_count in question_feature for q1, then for q2, of every training pair.\n",
    "train1 = (\n",
    "    train.merge(question_feature, how=\"left\", left_on=\"q1\", right_on=\"qid\")\n",
    "    .drop(\"qid\", axis=1)\n",
    "    .rename(columns={\"q_count\": \"q1_count\"})\n",
    "    .merge(question_feature, how=\"left\", left_on=\"q2\", right_on=\"qid\")\n",
    "    .drop(\"qid\", axis=1)\n",
    "    .rename(columns={\"q_count\": \"q2_count\"})\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Look up q_count in question_feature for q1, then for q2, of every test pair.\n",
    "test1 = (\n",
    "    test.merge(question_feature, how=\"left\", left_on=\"q1\", right_on=\"qid\")\n",
    "    .drop(\"qid\", axis=1)\n",
    "    .rename(columns={\"q_count\": \"q1_count\"})\n",
    "    .merge(question_feature, how=\"left\", left_on=\"q2\", right_on=\"qid\")\n",
    "    .drop(\"qid\", axis=1)\n",
    "    .rename(columns={\"q_count\": \"q2_count\"})\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def question_feature(data):\n",
    "    \"\"\"Return a copy of ``data`` with length, overlap and id-lookup columns added.\n",
    "\n",
    "    Reads the columns q1, q2, words1, words2, chars1 and chars2 of ``data`` and the\n",
    "    notebook-level ``question_dict``.\n",
    "\n",
    "    Added columns:\n",
    "      word1_len, word2_len, char1_len, char2_len -- len() of each token column entry\n",
    "      word_same, char_same -- size of the set intersection of the two token lists\n",
    "      q1_hash, q2_hash -- q1 / q2 mapped through ``question_dict``\n",
    "    \"\"\"\n",
    "    # Work on a copy so the caller's frame is left untouched and the cell can be re-run.\n",
    "    data = data.copy()\n",
    "    data[\"word1_len\"], data[\"word2_len\"] = data[\"words1\"].map(len), data[\"words2\"].map(len)\n",
    "    data[\"char1_len\"], data[\"char2_len\"] = data[\"chars1\"].map(len), data[\"chars2\"].map(len)\n",
    "    # Plain comprehensions instead of row-wise apply: same counts, and an empty frame is handled.\n",
    "    data[\"word_same\"] = [len(set(a) & set(b)) for a, b in zip(data[\"words1\"], data[\"words2\"])]\n",
    "    data[\"char_same\"] = [len(set(a) & set(b)) for a, b in zip(data[\"chars1\"], data[\"chars2\"])]\n",
    "    data[\"q1_hash\"], data[\"q2_hash\"] = data[\"q1\"].map(question_dict), data[\"q2\"].map(question_dict)\n",
    "    return data\n",
    "\n",
    "\n",
    "# NOTE(review): this def rebinds `question_feature`, which earlier cells use as the\n",
    "# question-count DataFrame; rebuild that DataFrame before re-running those cells.\n",
    "train2, test2 = question_feature(train1), question_feature(test1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first rows of train2.\n",
    "train2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Undirected graph with one edge per (q1, q2) pair taken from train and test.\n",
    "edges = [tuple(pair) for pair in pd.concat([train_data[[\"q1\", \"q2\"]], test_data[[\"q1\", \"q2\"]]]).values]\n",
    "graph = nx.Graph()  # networkx is imported as `nx`; the bare module name is undefined here\n",
    "graph.add_edges_from(edges)\n",
    "\n",
    "# Maximal cliques, largest first.\n",
    "cliques = sorted(nx.find_cliques(graph), key=len, reverse=True)\n",
    "# Membership table of the pairs in the orientation in which they appear in the data.\n",
    "map_label = dict.fromkeys(edges, 1)\n",
    "\n",
    "# NOTE(review): with largest-first ordering a later, smaller clique overwrites the size\n",
    "# already recorded for a pair, so the stored value is that of the smallest maximal clique\n",
    "# containing it; sort ascending if the largest one is intended -- confirm.\n",
    "map_clique_size = {}\n",
    "for clique in cliques:\n",
    "    for q1, q2 in combinations(clique, 2):\n",
    "        # A pair may be present in either orientation, or in both, so test each one.\n",
    "        if (q1, q2) in map_label:\n",
    "            map_clique_size[q1, q2] = len(clique)\n",
    "        if (q2, q1) in map_label:\n",
    "            map_clique_size[q2, q1] = len(clique)\n",
    "\n",
    "# -1 marks pairs for which no clique size was recorded.\n",
    "train2[\"clique_size\"] = [map_clique_size.get(pair, -1) for pair in zip(train2[\"q1\"], train2[\"q2\"])]\n",
    "test2[\"clique_size\"] = [map_clique_size.get(pair, -1) for pair in zip(test2[\"q1\"], test2[\"q2\"])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Core number of every node of the question graph, as a (qid, kcore) table.\n",
    "core_numbers = nx.core_number(graph)\n",
    "max_kcore = pd.DataFrame(list(core_numbers.items()), columns=[\"qid\", \"kcore\"])\n",
    "\n",
    "# Attach the core number of q1 and of q2 to every training pair.\n",
    "train3 = (\n",
    "    train2.merge(max_kcore, how=\"left\", left_on=\"q1\", right_on=\"qid\")\n",
    "    .drop(\"qid\", axis=1)\n",
    "    .rename(columns={\"kcore\": \"q1_kcore\"})\n",
    "    .merge(max_kcore, how=\"left\", left_on=\"q2\", right_on=\"qid\")\n",
    "    .drop(\"qid\", axis=1)\n",
    "    .rename(columns={\"kcore\": \"q2_kcore\"})\n",
    ")\n",
    "\n",
    "# Same for every test pair.\n",
    "test3 = (\n",
    "    test2.merge(max_kcore, how=\"left\", left_on=\"q1\", right_on=\"qid\")\n",
    "    .drop(\"qid\", axis=1)\n",
    "    .rename(columns={\"kcore\": \"q1_kcore\"})\n",
    "    .merge(max_kcore, how=\"left\", left_on=\"q2\", right_on=\"qid\")\n",
    "    .drop(\"qid\", axis=1)\n",
    "    .rename(columns={\"kcore\": \"q2_kcore\"})\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The id and token-list columns are dropped before writing; only the remaining columns are exported.\n",
    "non_feature_columns = [\"q1\", \"q2\", \"words1\", \"chars1\", \"words2\", \"chars2\"]\n",
    "\n",
    "# Write next to the inputs, following DATA_PATH instead of a second hard-coded directory.\n",
    "train3.drop(non_feature_columns, axis=1).to_csv(DATA_PATH + \"train_feature.csv\", index=False)\n",
    "test3.drop(non_feature_columns, axis=1).to_csv(DATA_PATH + \"test_feature.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
